Mapping Global Conflict

We will explore the data set from the Uppsala Conflict Data Program (UCDP).

To start, we download and clean the data. Then we do some light analysis and plotting. Finally, we will make a prediction using the insights we’ve gained.

Download Data and Prep

This is the nitty gritty of getting the data and putting it into a usable form. An important step, but a tedious one to read through. Feel free to skip ahead.

# Request the first page (up to 1000 records, per pagesize) of 2019 events
prac <- fromJSON(
    'https://ucdpapi.pcr.uu.se/api/gedevents/20.1?pagesize=1000&StartDate=2019-01-01&EndDate=2019-12-31'
)

# Pull the event records out of the parsed response
result <- prac[['Result']]

# Replace every NULL element of a list with NA.
#
# x: a list (here, one parsed record) whose elements may be NULL.
# Returns x with the same length and names; NULL entries become NA and all
# other entries are left untouched.
null_to_na <- function(x) {

    # seq_along() gives an empty sequence for zero-length input, whereas
    # 1:length(x) would iterate over c(1, 0) and index out of bounds
    for (i in seq_along(x)) {
        if (is.null(x[[i]])) {
            x[[i]] <- NA
        }
    }
    return(x)

}

# Set nulls to NA so every record can be converted to a data frame row
result <- lapply(result, null_to_na)

# Turn each record into a one-row data frame and stack them with a single
# rbind() call. This avoids re-copying the growing frame on every iteration
# and also works when the page holds only one record (the previous
# 2:length(result) loop counted backwards in that case).
c_df <- do.call(rbind, unname(lapply(result, data.frame)))

# Collect the URL of every page after the first by following NextPageUrl.
# Start from the link reported by the first response.
URL <- prac$NextPageUrl
url_list <- character(0)

# Keep following the chain until the API stops reporting a next page
# (NULL or empty string). There is no hard-coded page limit, so a larger
# query is not silently truncated. The %in% check guards against a
# response that points back at a page already seen.
while (!is.null(URL) && nzchar(URL) && !(URL %in% url_list)) {
    url_list <- c(url_list, URL)
    listing <- fromJSON(URL)
    # Brief pause between requests to go easy on the server
    Sys.sleep(.2)
    URL <- listing$NextPageUrl
}

# Download one page of results and return it as a data frame.
#
# x: a single URL (character) that returns JSON containing a `Result` list
#    of records.
# Returns a data frame with one row per record, or an empty data frame when
# the page contains no records. NULL fields are converted to NA first so
# that every record yields a complete row.
get_all_data <- function(x) {
    # Fetch the page and keep only the list of records
    data <- fromJSON(x)$Result

    # An empty page has nothing to convert; bind_rows() accepts this as-is
    if (length(data) == 0) {
        return(data.frame())
    }

    # Turn Nulls to NA's
    data <- lapply(data, null_to_na)

    # One-row data frame per record, stacked in a single rbind() call.
    # Works for a page with a single record and avoids growing the frame
    # inside a loop.
    do.call(rbind, unname(lapply(data, data.frame)))
}

# Download every remaining page and collapse the per-page data frames
# into a single data frame
yes <- bind_rows(lapply(url_list, get_all_data))

# Append those rows to the rows from page 1
c_df <- rbind(c_df, yes)

# As tibble. The frame is passed exactly once; previously it was piped in
# and also given again as a second positional argument.
c_df <- as_tibble(c_df)

# Create a civilian deaths categorical variable:
# 'yes' when deaths_civilians > 0, otherwise (including NA) 'no'
c_df <- c_df %>%
    mutate(civ_cat = case_when(deaths_civilians > 0 ~ 'yes', TRUE ~ 'no'))

# Save file for later use (save() needs the path as the named `file` argument)
#save(c_df, file = "data/conflict_19.Rds")
#load(file = "data/conflict_19.Rds")

Explore

Let’s look at some stuff.

Tables

Random Subset

# Fix the RNG seed so the sampled rows are reproducible
set.seed(42)

# Five randomly chosen events with a best estimate above 10 deaths,
# shown in chronological order
c_df %>%
    filter(best > 10) %>%
    sample_n(5) %>%
    select(date_start, country, side_a, side_b, deaths_a, deaths_b, civ_cat) %>%
    arrange(date_start) %>%
    kable(
        col.names = c('Date', 'Conflict Location', 'Side A', 'Side B', 'Deaths A', 'Deaths B', 'Civilian Causualties'),
        align = 'c',
        caption = 'Random Subset'
    )
Random Subset
Date Conflict Location Side A Side B Deaths A Deaths B Civilian Causualties
2019-03-12T00:00:00 Nigeria Government of Nigeria IS 0 22 no
2019-05-14T00:00:00 Afghanistan Government of Afghanistan Taleban 0 17 no
2019-09-01T00:00:00 Mexico Jalisco Cartel New Generation Santa Rosa de Lima Cartel 0 0 no
2019-09-19T00:00:00 DR Congo (Zaire) Government of DR Congo (Zaire) CMC 23 0 no
2019-10-03T00:00:00 Afghanistan Government of Afghanistan Taleban 0 24 no

Most Events By Country

# Ten countries with the most recorded events, with the summed best
# estimate of deaths for each. A single summarise() yields one row per
# country directly, replacing the add_count() + multi-row summarise() +
# distinct() chain.
# NOTE(review): the caption reads 'Most Deaths' but the table is ranked by
# number of events -- confirm which was intended.
c_df %>%
    group_by(country) %>%
    summarise(n = n(), total_estimated = sum(best)) %>%
    arrange(desc(n)) %>%
    head(10) %>%
    kable(
        col.names = c('Conflict Location', 'Total Events', 'Total Deaths'),
        align = 'c',
        caption = 'Most Deaths'
    )
Most Deaths
Conflict Location Total Events Total Deaths
Afghanistan 4682 30434
Syria 2242 10931
Mexico 786 11789
Nigeria 509 2437
Somalia 426 2221
DR Congo (Zaire) 416 2393
India 352 728
Brazil 263 1296
Iraq 246 803
Cameroon 239 858

Afghanistan had far and away the most conflict events, followed by Syria and Mexico.

Events with the Largest Death Counts

# The ten single events with the highest best-estimate death count
c_df %>%
    select(date_start, country, side_a, side_b, deaths_a, deaths_b, deaths_unknown, best, civ_cat) %>%
    arrange(desc(best)) %>%
    head(10) %>%
    kable(
        col.names = c('Date', 'Conflict Location', 'Side A', 'Side B', 'Deaths A', 'Deaths B', 'Unknown Deaths', 'Best Estimate for Total Deaths', 'Civilian Causualties'),
        align = 'c',
        caption = 'Largest Death Count per Conflict Event in 2019'
    )
Largest Death Count per Conflict Event in 2019
Date Conflict Location Side A Side B Deaths A Deaths B Unknown Deaths Best Estimate for Total Deaths Civilian Causualties
2019-01-01T00:00:00 Brazil Comando Vermelho GDE 0 0 739 739 no
2019-10-01T00:00:00 Mexico Jalisco Cartel New Generation Santa Rosa de Lima Cartel 0 0 278 278 no
2019-12-01T00:00:00 Mexico Jalisco Cartel New Generation Santa Rosa de Lima Cartel 0 0 272 272 no
2019-12-01T00:00:00 Mexico Jalisco Cartel New Generation La Familia 0 0 266 266 no
2019-09-01T00:00:00 Mexico Jalisco Cartel New Generation Santa Rosa de Lima Cartel 0 0 250 250 no
2019-11-01T00:00:00 Mexico Jalisco Cartel New Generation Santa Rosa de Lima Cartel 0 0 245 245 no
2019-10-01T00:00:00 Mexico Jalisco Cartel New Generation Sinaloa Cartel 0 0 231 231 no
2019-03-19T00:00:00 Syria IS SDF 20 0 165 230 yes
2019-07-01T00:00:00 Mexico Jalisco Cartel New Generation La Familia 0 0 229 229 no
2019-02-01T00:00:00 Mexico Jalisco Cartel New Generation Santa Rosa de Lima Cartel 0 0 226 226 no

We notice a few things from the table above. Mexico had many deadly conflict events in 2019, their dates are truncated to the first of the month, and nearly all deaths are classified as ‘unknown’. This leads me to believe that the deaths are not one single conflict event, but instead a collection of smaller events that are aggregated and reported at the end of the month.

USA Involved

# Events where actor id 769 (the USA, per the table caption) appears on
# either side, in chronological order
c_df %>%
    filter(side_a_new_id == 769 | side_b_new_id == 769) %>%
    select(date_start, country, side_a, side_b, deaths_a, deaths_b, civ_cat) %>%
    arrange(date_start) %>%
    kable(
        col.names = c('Date', 'Conflict Location', 'Side A', 'Side B', 'Deaths A', 'Deaths B', 'Civilian Causualties'),
        align = 'c',
        caption = 'USA Directly Involved'
    )
USA Directly Involved
Date Conflict Location Side A Side B Deaths A Deaths B Civilian Causualties
2019-04-07T00:00:00 Afghanistan Government of United States of America al-Qaida 0 2 no
2019-05-21T00:00:00 Afghanistan Government of United States of America al-Qaida 0 2 no
2019-06-29T00:00:00 Afghanistan Government of United States of America al-Qaida 0 2 no
2019-06-29T00:00:00 Afghanistan Government of United States of America al-Qaida 0 1 no
2019-07-30T00:00:00 Afghanistan Government of United States of America al-Qaida 0 9 no
2019-07-30T00:00:00 Afghanistan Government of United States of America al-Qaida 0 4 no
2019-07-30T00:00:00 Afghanistan Government of United States of America al-Qaida 0 8 no
2019-09-07T00:00:00 Afghanistan Government of United States of America al-Qaida 0 9 no

There doesn’t appear to be a ton of direct US involvement in the 19-year-old conflict with the Taliban.

# Afghanistan government involved: events where actor id 130 (labelled
# 'Afghanistan Military' in the caption) appears on either side, showing
# the ten with the most side-B deaths.
# The earlier predicate tested side_a_new_id twice; the second test now
# checks side_b_new_id, mirroring the USA filter.
c_df %>%
    filter(side_a_new_id == 130 | side_b_new_id == 130) %>%
    select(date_start, country, side_a, side_b, deaths_a, deaths_b, civ_cat) %>%
    arrange(desc(deaths_b)) %>%
    head(10) %>%
    kable(
        col.names = c('Date', 'Conflict Location', 'Side A', 'Side B', 'Deaths A', 'Deaths B', 'Civilian Causualties'),
        align = 'c',
        caption = 'Afghanistan Military'
    )
Afghanistan Military
Date Conflict Location Side A Side B Deaths A Deaths B Civilian Causualties
2019-08-31T00:00:00 Afghanistan Government of Afghanistan Taleban 0 100 no
2019-04-06T00:00:00 Afghanistan Government of Afghanistan Taleban 24 99 no
2019-03-22T00:00:00 Afghanistan Government of Afghanistan Taleban 12 94 yes
2019-09-04T00:00:00 Afghanistan Government of Afghanistan Taleban 0 93 no
2019-03-26T00:00:00 Afghanistan Government of Afghanistan IS 0 87 no
2019-09-06T00:00:00 Afghanistan Government of Afghanistan Taleban 0 85 no
2019-09-14T00:00:00 Afghanistan Government of Afghanistan Taleban 0 84 no
2019-09-28T00:00:00 Afghanistan Government of Afghanistan Taleban 0 61 no
2019-11-24T00:00:00 Afghanistan Government of Afghanistan Taleban 8 60 no
2019-06-19T00:00:00 Afghanistan Government of Afghanistan Taleban 0 59 no

So it appears the US itself isn’t as active as the Afghan state military; however, considering the US spent 38 billion (with a b) dollars in 2019 alone, well…

Civilian Deaths Globally

# Number of events with and without civilian deaths, smallest count first
c_df %>%
    count(civ_cat) %>%
    arrange(n) %>%
    kable(
        caption = 'Civilian Deaths',
        col.names = c('Civilian Deaths', 'Number of Events')
    )
Civilian Deaths
Civilian Deaths Number of Events
yes 2795
no 9689

Including Plots

Average Deaths Per Conflict

# Monthly death totals for each country with more than 300 events:
# round each event's start date down to its month, sum the best estimates
# per country and month, then draw one line panel per country.
# summarise() keeps one row per country-month, so the line is drawn
# through unique points rather than one duplicated point per event.
c_df %>%
    add_count(country) %>%
    filter(n > 300) %>%
    mutate(rounded_date = floor_date(as.Date(date_start), unit = 'month')) %>%
    group_by(country, rounded_date) %>%
    summarise(sum_deaths = sum(best)) %>%
    ungroup() %>%
    ggplot(aes(rounded_date, sum_deaths, color = country)) +
    geom_line() +
    theme(axis.text.x = element_text(angle = 45, vjust = .5), legend.position = 'none') +
    labs(x = 'Dates', y = 'Total Deaths', title = 'Deaths Per Month') +
    # Free y scales so low-count countries remain readable
    facet_wrap(~country, scales = 'free_y')

# Average deaths per event for every country with more than 100 events,
# bars ordered from highest to lowest average and labelled with the
# number of events (n). Standard deviation is not shown.
c_df %>%
    group_by(country) %>%
    # One row per country: region, event count, mean and total deaths
    summarise(region = unique(region),
          n = n(),
          average = sum(best) / n,
          sum_deaths = sum(best)) %>%
    filter(n > 100) %>%
    mutate(country = fct_reorder(country, -average)) %>%
    ggplot(aes(country, average, fill = country)) +
    geom_col() +
    geom_text(aes(label = paste0('n= ', n), angle = 90), hjust = 'top') +
    scale_y_continuous(n.breaks = 8) +
    labs(x = 'Country', y = 'Average Deaths per Violent Encounter 2019') +
    theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position =  'none')

As mentioned above, Mexico’s death counts per event are suspicious. There might be some aggregation going on.

Deaths per Conflict

# Histogram of the best death estimate per event, one panel per region.
# Events with 50 or more deaths are excluded so the bulk of the
# distribution stays readable.
c_df %>%
    filter(best < 50) %>%
    ggplot(aes(x = best)) +
    geom_histogram(bins = 50, aes(fill=region)) +
    facet_wrap(~region) +
    # The separate xlab() call was dropped: labs(x = ...) set after it
    # replaced its label, so it never had any effect
    labs(title = 'Are some regions events more deadly?', y ='Number of Events', x='Number of Deaths')

The distributions appear to be broadly the same across regions.

# World map of every event, coloured by type of violence.
# (Mapping approach credited to Julia Silge in the original notes.)
world <- map_data('world')

# Grey basemap with one point per event on top.
# The legend title is set through guides() only. The former
# theme(legend.title = element_text('Type of Violence')) passed the title
# as element_text()'s first argument, which is the font family, so it
# requested a font of that name instead of setting any text.
ggplot() +
    geom_map(data = world, map = world, aes(long, lat, map_id = region),
             color = 'white', fill = 'gray50', alpha = .2) +
    geom_point(data = c_df, aes(longitude, latitude, color = as.factor(type_of_violence)),
               alpha = .8) +
    labs(title = 'Global Violence') +
    # Labels are applied to the factor levels of type_of_violence in order
    scale_color_brewer(palette = 'Set1', labels = c('State Based', 'Non-State', 'One-Sided')) +
    guides(color = guide_legend('Type of Violence'))

# World map of every event, coloured by whether civilians were killed
# (civ_cat is 'yes' or 'no')
ggplot() +
    geom_map(data = world, map = world, aes(long, lat, map_id = region),
             color = 'white', fill = 'gray50', alpha = .3) +
    geom_point(
        data = c_df,
        aes(longitude, latitude, color = as.factor(civ_cat), group = id),
        alpha = .8
    ) +
    labs(title = 'Conflict Civilian Deaths 2019', x ='', y= '') +
    guides(color = guide_legend('Civilian Deaths')) +
    scale_color_brewer(palette = 'Dark2')

Let’s look at January 2019 for a few select countries.

# Afghanistan events that started in January 2019: keep the columns needed
# for plotting and convert the date strings to Date before filtering
c_jan_19_af <- c_df %>%
    filter(country == 'Afghanistan') %>%
    select(id, best, latitude, longitude, side_a, side_b, date_start, date_end) %>%
    mutate(date_start = as.Date(date_start), date_end = as.Date(date_end)) %>%
    filter(date_start >= as.Date('2019-01-01'),
           date_start <= as.Date('2019-01-31'))

# Country outline shapefile -- https://hub.arcgis.com/datasets/2b63527870ef416bacf83bcaf388685f_0/data
afg_sf <- read_sf('afghanistan')


# Static ggplot: country outline with one point per event. The `ids` and
# `frame` aesthetics are there for plotly; frame is given as a character
# date so each start date becomes one animation frame.
afg <-
    ggplot(data = c_jan_19_af) +
    geom_sf(data = afg_sf, fill = 'gray50', alpha =.1) +
    geom_point(aes(longitude, latitude, ids = id,
                   frame = as.character(date_start)),
               alpha = .8, color = 'darkred', size = 1) +
    labs(title = 'Conflict in Afghanistan January 2019',
         x = 'Longitude',
         y = 'Latitude')

# Convert to an interactive plotly widget at a fixed size
afg <- ggplotly(afg, width = 500, height = 500)

# Display the animation: 1000 per frame, linear easing, no full redraw
afg %>%
    animation_opts(1000, easing = "linear", redraw = FALSE)
# India events that started in January 2019: keep the columns needed for
# plotting and convert the date strings to Date before filtering
c_jan_19_in <- c_df %>%
    filter(country == 'India') %>%
    select(id, best, latitude, longitude, side_a, side_b, date_start, date_end) %>%
    mutate(date_start = as.Date(date_start), date_end = as.Date(date_end)) %>%
    filter(date_start >= as.Date('2019-01-01'),
           date_start <= as.Date('2019-01-31'))

# Country outline shapefile -- https://hub.arcgis.com/datasets/2b37b84e67374fb98577c20ef8be6c62_0
india_sf <- read_sf('india')

# Static ggplot: country outline with one point per event. The `ids` and
# `frame` aesthetics are there for plotly; frame is given as a character
# date so each start date becomes one animation frame.
ind <-
    ggplot(data = c_jan_19_in) +
    geom_sf(data = india_sf, fill = 'gray50', alpha =.1) +
    geom_point(aes(longitude, latitude, ids = id,
                   frame = as.character(date_start)),
               alpha = .8, color = 'darkred', size = 1) +
    labs(title = 'Conflict in India January 2019',
         x = 'Longitude',
         y = 'Latitude')

# Convert to an interactive plotly widget at a fixed size
ind <- ggplotly(ind, width = 500, height = 500)

# Display the animation: 1000 per frame, linear easing, no full redraw
ind %>%
    animation_opts(1000, easing = "linear", redraw = FALSE)